# ! pip install fasttext
# ! pip install ktrain
# ! pip install shap
import fasttext
import ktrain
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import shap
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, f1_score, matthews_corrcoef, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectPercentile, f_classif
import tensorflow as tf
from google.colab import drive
drive.mount('/content/drive')
Mounted at /content/drive
Pay special attention to the distribution of the positive and negative examples in the first task as well as distribution of the classes in the second task.
# Root of the lab data on Google Drive; each task ships as separate
# text/tag file pairs (one sample / one integer label per line).
DATA_DIR = '/content/drive/MyDrive/NLP'
files = {
    # Task 1: binary — does the tweet contain harmful content?
    'task_1': {
        'train': {
            'text': f'{DATA_DIR}/task_6-1/training_set_clean_only_text.txt',
            'class': f'{DATA_DIR}/task_6-1/training_set_clean_only_tags.txt'
        },
        'test': {
            'text': f'{DATA_DIR}/task_6-1/test_set_clean_only_text.txt',
            'class': f'{DATA_DIR}/task_6-1/test_set_clean_only_tags.txt'
        }
    },
    # Task 2: 3-class — neutral / cyberbullying / hate speech.
    'task_2': {
        'train': {
            'text': f'{DATA_DIR}/task_6-2/training_set_clean_only_text.txt',
            'class': f'{DATA_DIR}/task_6-2/training_set_clean_only_tags.txt'
        },
        'test': {
            'text': f'{DATA_DIR}/task_6-2/test_set_only_text.txt',
            'class': f'{DATA_DIR}/task_6-2/test_set_only_tags.txt'
        }
    }
}
Preprocessing
# Accumulators for the parsed datasets; filled in place by the helpers below.
task1 = {'train': {'text': [], 'class': []}, 'test': {'text': [], 'class': []}}
task2 = {'train': {'text': [], 'class': []}, 'test': {'text': [], 'class': []}}
def create_test_train_ds(file_dict: dict, task_dataset):
    """Read one text/label file pair into ``task_dataset`` in place.

    Parameters
    ----------
    file_dict : dict
        Mapping with 'text' and 'class' keys pointing at input files
        (one sample / one integer label per line, same order).
    task_dataset : dict
        Mapping with 'text' and 'class' list entries that are appended to.
    """
    # Iterate the file object directly instead of materializing
    # file.readlines() — same order, no intermediate list in memory.
    with open(file_dict['text']) as file:
        for line in file:
            task_dataset['text'].append(line.strip())
    with open(file_dict['class']) as file:
        for line in file:
            task_dataset['class'].append(int(line.strip()))
def create_task_dataset(file_dict, task_dataset):
    """Load both splits of one task into ``task_dataset`` (in place)."""
    for split in ('train', 'test'):
        create_test_train_ds(file_dict[split], task_dataset[split])
# Load both tasks and wrap each split in a DataFrame with 'text'/'class' columns.
create_task_dataset(files['task_1'], task1)
create_task_dataset(files['task_2'], task2)
task1_train_df = pd.DataFrame(task1['train'])
task1_test_df = pd.DataFrame(task1['test'])
task2_train_df = pd.DataFrame(task2['train'])
task2_test_df = pd.DataFrame(task2['test'])
Task 1 - o wiele więcej jest instancji bez raniących wyrażeń niż z nimi.
task1_train_df.plot.hist('class')
<matplotlib.axes._subplots.AxesSubplot at 0x7f93e135e790>
task1_test_df.plot.hist('class')
<matplotlib.axes._subplots.AxesSubplot at 0x7f93e143a250>
Task 2 - miażdżącą przewagę mają wyrażenia neutralne, o wiele mniej jest wyrażeń zawierających hejt, najmniej które zawierają cyberprzemoc.
task2_train_df.plot.hist('class')
<matplotlib.axes._subplots.AxesSubplot at 0x7f93e084cd10>
task2_test_df.plot.hist('class')
<matplotlib.axes._subplots.AxesSubplot at 0x7f93e07de150>
Zbiory są BARDZO niezrównoważone, więc zrównoważę ich zbiory treningowe.
def balance_dataset(dataframe):
    """Oversample minority classes so every class matches the largest one.

    Parameters
    ----------
    dataframe : pd.DataFrame
        Must contain a 'class' column.

    Returns
    -------
    pd.DataFrame
        Shuffled, class-balanced copy with a fresh RangeIndex.
    """
    max_size = dataframe['class'].value_counts().max()
    data_list = [dataframe]
    for class_index, group in dataframe.groupby('class'):
        # replace=True because a small class may need more extra rows than
        # it has members; the majority class contributes 0 extra rows.
        data_list.append(group.sample(max_size - len(group), replace=True))
    dataframe = pd.concat(data_list).sample(frac=1)
    # BUG FIX: reset_index returns a new frame — the original call
    # discarded the result, leaving duplicated indices in place.
    return dataframe.reset_index(drop=True)
balanced_t1 = balance_dataset(task1_train_df)
balanced_t1.plot.hist('class')
<matplotlib.axes._subplots.AxesSubplot at 0x7f93e0754550>
balanced_t2 = balance_dataset(task2_train_df)
balanced_t2.plot.hist('class')
<matplotlib.axes._subplots.AxesSubplot at 0x7f93e0708210>
# Per-task, per-model prediction store; filled in as each classifier is evaluated.
pred_labels = {'task1': {'tfidf': None, 'fasttext': None, 'transformer': None}, 'task2': {'tfidf': None, 'fasttext': None, 'transformer': None}}
def eval_tfidf_classifier(train, test):
    """Train GaussianNB on TF-IDF features of ``train``, predict ``test``.

    Parameters
    ----------
    train, test : pd.DataFrame
        Frames with 'text' and 'class' columns.

    Returns
    -------
    tuple
        (predicted labels for test, fitted model, dense test feature
        matrix, fitted vectorizer) — the extras are reused later for
        LIME inspection.
    """
    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5)
    # GaussianNB cannot consume sparse input, hence .toarray(); note this
    # is memory-hungry for large vocabularies.
    features_train = vectorizer.fit_transform(train['text']).toarray()
    features_test = vectorizer.transform(test['text']).toarray()
    model = GaussianNB()
    model.fit(features_train, train['class'])
    return model.predict(features_test), model, features_test, vectorizer
pred_labels['task1']['tfidf'], tfidf_model1, tfidf_test1, tfidf_vect1 = eval_tfidf_classifier(balanced_t1, task1_test_df)
pred_labels['task2']['tfidf'], tfidf_model2, tfidf_test2, tfidf_vect2 = eval_tfidf_classifier(balanced_t2, task2_test_df)
def prepare_train_data_for_fasttext(train_data, output_path):
    """Dump ``train_data`` to fastText supervised-learning format.

    Each row becomes one line ``__label__<class> <text>`` with any
    embedded newlines in the text flattened to spaces.
    """
    with open(output_path, 'w+') as out:
        for _, record in train_data.iterrows():
            flattened = record['text'].replace('\n', ' ')
            out.write(f'__label__{record["class"]} {flattened}\n')
prepare_train_data_for_fasttext(balanced_t1, f'{DATA_DIR}/fasttext_train1.txt')
prepare_train_data_for_fasttext(balanced_t2, f'{DATA_DIR}/fasttext_train2.txt')
def eval_fasttext(filepath, data):
    """Train a supervised fastText model and predict ``data['text']``.

    Returns (list of integer class predictions, trained classifier).
    """
    classifier = fasttext.train_supervised(input=filepath)
    predictions = []
    for sent in data['text']:
        # predict() returns (labels, probabilities); take the top label
        # and strip fastText's '__label__' prefix to recover the int class.
        raw = classifier.predict(sent)
        predictions.append(int(raw[0][0].replace('__label__', '')))
    return predictions, classifier
pred_labels['task1']['fasttext'], fasttext1 = eval_fasttext(f'{DATA_DIR}/fasttext_train1.txt', task1_test_df)
pred_labels['task2']['fasttext'], fasttext2 = eval_fasttext(f'{DATA_DIR}/fasttext_train2.txt', task2_test_df)
def preprocess_for_transformer(data):
    """Tokenize a 'text'/'class' DataFrame into a batched tf.data.Dataset.

    NOTE(review): AutoTokenizer is imported further down in this notebook
    (after the first call to this function) — this only ran because the
    cells were executed out of order; the import should be moved up.
    """
    tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
    def tokenize_function(examples):
        # Pad/truncate every sample to the model's maximum input length.
        return tokenizer(examples, padding="max_length", truncation=True)
    tokenized_datasets = data.copy()
    # NOTE: the comprehension variable shadows the 'data' parameter.
    counted = [tokenize_function(data) for data in tokenized_datasets['text']]
    train_features = {
        'input_ids': [count['input_ids'] for count in counted],
        'token_type_ids': [count['token_type_ids'] for count in counted],
        'attention_mask': [count['attention_mask'] for count in counted]
    }
    train_tf_dataset = tf.data.Dataset.from_tensor_slices((train_features, data["class"]))
    # WARNING(review): this shuffles unconditionally and the function is
    # also used for the *test* sets — predictions made on the shuffled
    # dataset no longer line up with the original label order, which
    # likely explains the degenerate transformer metrics (MCC = 0) below.
    train_tf_dataset = train_tf_dataset.shuffle(len(train_tf_dataset)).batch(8)
    return train_tf_dataset
train1_prep = preprocess_for_transformer(balanced_t1)
val1_prep = preprocess_for_transformer(task1_test_df)
import tensorflow as tf
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer
def eval_transformer(labels, train_dataset, eval_dataset):
    """Fine-tune bert-base-cased for sequence classification.

    Parameters
    ----------
    labels : int
        Number of target classes.
    train_dataset, eval_dataset : tf.data.Dataset
        Batched datasets from preprocess_for_transformer.

    Returns the fitted model.

    NOTE(review): "bert-base-cased" is an English checkpoint while the
    data is Polish tweets — a Polish model would presumably fit better;
    the near-chance training accuracy in the logs supports this.
    """
    model = TFAutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=labels)
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5),
        # The model outputs raw logits, hence from_logits=True.
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        metrics=tf.metrics.SparseCategoricalAccuracy(),
    )
    model.fit(train_dataset, validation_data=eval_dataset, epochs=3)
    return model
m = eval_transformer(2, train1_prep, val1_prep)
Epoch 1/3 2298/2298 [==============================] - 4025s 2s/step - loss: 0.7047 - sparse_categorical_accuracy: 0.4998 - val_loss: 0.7562 - val_sparse_categorical_accuracy: 0.1340 Epoch 2/3 2298/2298 [==============================] - 3998s 2s/step - loss: 0.7008 - sparse_categorical_accuracy: 0.5078 - val_loss: 0.5949 - val_sparse_categorical_accuracy: 0.8660 Epoch 3/3 2298/2298 [==============================] - 3997s 2s/step - loss: 0.6982 - sparse_categorical_accuracy: 0.5021 - val_loss: 0.6653 - val_sparse_categorical_accuracy: 0.8660
m.save_pretrained(f'{DATA_DIR}/task1_model')
--------------------------------------------------------------------------- NameError Traceback (most recent call last) <ipython-input-47-de930f983c9a> in <module>() ----> 1 m.save_pretrained(f'{DATA_DIR}/task1_model') NameError: name 'm' is not defined
train2_prep = preprocess_for_transformer(balanced_t2)
val2_prep = preprocess_for_transformer(task2_test_df)
m2 = eval_transformer(3, train2_prep, val2_prep)
m2.save_pretrained(f'{DATA_DIR}/task2_model')
m2 = TFAutoModelForSequenceClassification.from_pretrained(f'{DATA_DIR}/task2_model')
pred_labels['task2']['transformer'] = m2.predict(val2_prep)
tf_prediction = tf.nn.softmax(pred_labels['task2']['transformer'][0], axis=1).numpy()
pred_labels['task2']['transformer'] = [0 if el[0] > el[1] else 1 for el in tf_prediction]
m = TFAutoModelForSequenceClassification.from_pretrained(f'{DATA_DIR}/task1_model')
pred_labels['task1']['transformer'] = m.predict(val1_prep)
tf_prediction = tf.nn.softmax(pred_labels['task1']['transformer'][0], axis=1).numpy()
pred_labels['task1']['transformer'] = [0 if el[0] > el[1] else 1 for el in tf_prediction]
Select the appropriate measures (from accuracy, F1, macro/micro F1, MCC) to compare the results.
def print_metrics(y_true, y_pred):
    """Print accuracy, weighted/macro/micro F1 and MCC for one prediction set."""
    report = [
        ('Accuracy', accuracy_score(y_true, y_pred)),
        ('F1', f1_score(y_true, y_pred, average="weighted")),
        ('Macro F1', f1_score(y_true, y_pred, average="macro")),
        ('Micro F1', f1_score(y_true, y_pred, average="micro")),
        ('MCC', matthews_corrcoef(y_true, y_pred)),
    ]
    for name, value in report:
        print(f'{name}: {value}')
print('TFIDF')
print_metrics(task1_test_df['class'], pred_labels['task1']['tfidf'])
print('FASTTEXT')
print_metrics(task1_test_df['class'], pred_labels['task1']['fasttext'])
print('TRANSFORMER')
print_metrics(task1_test_df['class'], pred_labels['task1']['transformer'])
TFIDF Accuracy: 0.783 F1: 0.7878030091930415 Macro F1: 0.5534161605726151 Micro F1: 0.7829999999999999 MCC: 0.10745050407812812 FASTTEXT Accuracy: 0.877 F1: 0.8494275537721001 Macro F1: 0.6226519286167892 Micro F1: 0.8769999999999999 MCC: 0.3111077063768808 TRANSFORMER Accuracy: 0.866 F1: 0.8038113612004287 Macro F1: 0.4640943193997856 Micro F1: 0.866 MCC: 0.0
invalid value encountered in double_scalars
# BUG FIX: the task 2 predictions were being scored against the *task 1*
# ground truth (task1_test_df['class']); use the task 2 test labels.
print('TFIDF')
print_metrics(task2_test_df['class'], pred_labels['task2']['tfidf'])
print('FASTTEXT')
print_metrics(task2_test_df['class'], pred_labels['task2']['fasttext'])
print('TRANSFORMER')
print_metrics(task2_test_df['class'], pred_labels['task2']['transformer'])
TFIDF Accuracy: 0.776 F1: 0.7774204426600595 Macro F1: 0.3257908913986824 Micro F1: 0.776 MCC: 0.06054021107458979 FASTTEXT Accuracy: 0.855 F1: 0.8123805420064407 Macro F1: 0.32823970319591705 Micro F1: 0.855 MCC: 0.14189520715595239 TRANSFORMER Accuracy: 0.134 F1: 0.03166843033509701 Macro F1: 0.11816578483245152 Micro F1: 0.134 MCC: 0.0
invalid value encountered in double_scalars
(for the best classifier) and compare the decisions of each classifier on these examples using LIME.
BEST WAS FASTTEXT SO IT WILL BE USED.
# Attach each model's predictions as columns so individual TP/TN/FP/FN
# examples can be selected per classifier below.
task1_test_df['tfidf'] = pred_labels['task1']['tfidf']
task1_test_df['fasttext'] = pred_labels['task1']['fasttext']
task1_test_df['transformer'] = pred_labels['task1']['transformer']
task2_test_df['tfidf'] = pred_labels['task2']['tfidf']
task2_test_df['fasttext'] = pred_labels['task2']['fasttext']
task2_test_df['transformer'] = pred_labels['task2']['transformer']
task1_test_df[task1_test_df['class']==task1_test_df['fasttext']]
# tfidf_t = task1_test_df[task1_test_df['class']==task1_test_df['tfidf']]
# tfidf_tp = tfidf_t[tfidf_t['tfidf'] == 1].iloc[0]
# tfidf_tn = tfidf_t[tfidf_t['tfidf'] == 0].iloc[0]
# tfidf_f = task1_test_df[task1_test_df['class']!=task1_test_df['tfidf']]
# tfidf_fp = tfidf_f[tfidf_f['tfidf'] == 1].iloc[0]
# tfidf_fn = tfidf_f[tfidf_f['tfidf'] == 0].iloc[0]
# Pick one concrete example of each confusion-matrix cell for the fastText
# classifier on task 1: rows where prediction == label (t) / != label (f),
# then the first predicted-positive / predicted-negative row of each.
fasttext_t = task1_test_df[task1_test_df['class']==task1_test_df['fasttext']]
fasttext_tp = fasttext_t[fasttext_t['fasttext'] == 1].iloc[0]
fasttext_tn = fasttext_t[fasttext_t['fasttext'] == 0].iloc[0]
fasttext_f = task1_test_df[task1_test_df['class']!=task1_test_df['fasttext']]
fasttext_fp = fasttext_f[fasttext_f['fasttext'] == 1].iloc[0]
fasttext_fn = fasttext_f[fasttext_f['fasttext'] == 0].iloc[0]
# transformer_t = task1_test_df['transformer' == 'class']
# transformer_tp = transformer_t['transformer' == 1]
# transformer_tn = transformer_t['transformer' == 0]
# transformer_f = task1_test_df['transformer' != 'class']
# transformer_fp = transformer_f['transformer' == 1]
# transformer_fn = transformer_f['transformer' == 0]
# !pip3 install lime
from lime import lime_text
from lime.lime_text import LimeTextExplainer
def lime_model(model, data, f_no):
    """Explain one binary (task 1) prediction of ``model`` on text ``data``.

    f_no — number of word features to include in the explanation.
    Returns the LIME Explanation (also rendered inline in the notebook).
    """
    explainer = LimeTextExplainer(class_names=['bez', 'z'])
    explanation = explainer.explain_instance(
        data, model.predict_proba, num_features=f_no
    )
    explanation.as_pyplot_figure()
    explanation.show_in_notebook(text=True)
    explanation.as_list()
    return explanation
from sklearn.base import TransformerMixin
class DenseTransformer(TransformerMixin):
    """sklearn pipeline step densifying sparse matrices (GaussianNB
    cannot consume scipy sparse input)."""

    def fit(self, X, y=None, **kwargs):
        """No-op fit; the transformer is stateless."""
        return self

    def transform(self, X, y=None, **kwargs):
        """Return ``X`` converted to a dense numpy array."""
        return X.toarray()
from sklearn.pipeline import make_pipeline
vectorizer = tfidf_vect1
c = make_pipeline(vectorizer, DenseTransformer(), tfidf_model1)
txt = fasttext_tp['text']
txt
'@anonymized_account Dokładnie wie co mówi. A Ty pajacu poczytaj ustawę domsie dowiesz kto decyduje o wysokości zarobków w samorządach.'
lime_model(c, txt, len(txt.split()))
<lime.explanation.Explanation at 0x7f3e550cacd0>
txt = fasttext_tn['text']
txt
'@anonymized_account Spoko, jak im Duda z Morawieckim zamówią po pięć piw to wszystko będzie ok.'
lime_model(c, txt, len(txt.split()))
<lime.explanation.Explanation at 0x7f3e5548d210>
txt = fasttext_fp['text']
txt
'@anonymized_account @anonymized_account Kto mieczem wojuje, ten od pochwy ginie'
lime_model(c, txt, len(txt.split()))
<lime.explanation.Explanation at 0x7f3e54bf9a50>
txt = fasttext_fn['text']
txt
'@anonymized_account Tej szmaty się nie komentuje'
lime_model(c, txt, len(txt.split()))
<lime.explanation.Explanation at 0x7f3e554709d0>
from sklearn.base import TransformerMixin
class TraTransformer(TransformerMixin):
    """Pipeline step turning raw texts into a batched tf.data.Dataset of
    BERT inputs, so the HF model can sit at the end of an sklearn pipeline."""
    def fit(self, X, y=None, **fit_params):
        # Stateless: nothing to learn.
        return self
    def transform(self, X, y=None, **fit_params):
        # NOTE(review): the tokenizer is rebuilt on every call — hoisting
        # it to __init__ would be much cheaper for repeated LIME queries.
        tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
        def tokenize_function(examples):
            return tokenizer(examples, padding="max_length", truncation=True)
        counted = [tokenize_function(data) for data in X]
        train_features = {
            'input_ids': [count['input_ids'] for count in counted],
            'token_type_ids': [count['token_type_ids'] for count in counted],
            'attention_mask': [count['attention_mask'] for count in counted]
        }
        train_tf_dataset = tf.data.Dataset.from_tensor_slices((train_features['input_ids'], train_features['token_type_ids'], train_features['attention_mask']))
        # train_tf_dataset = tf.data.Dataset.from_tensor_slices((train_features, data["class"]))
        # WARNING(review): shuffling here permutes the samples, so batch
        # predictions no longer align with the input order — harmless for
        # single-text LIME calls, but wrong for multi-sample evaluation.
        train_tf_dataset = train_tf_dataset.shuffle(len(train_tf_dataset)).batch(8)
        return train_tf_dataset
c = make_pipeline(TraTransformer(), m)
c.predict(['ala ma kota'])
TFSequenceClassifierOutput([('logits',
array([[0.41266397, 0.33458605]], dtype=float32))])
def lime_model(model, data, f_no):
    """Explain one transformer prediction on text ``data`` with LIME.

    NOTE(review): the ``model`` argument is unused — the explainer calls
    the module-level ``tr_predict``, which closes over the global pipeline
    ``c``. The parameter is kept so existing call sites keep working.
    """
    explainer = LimeTextExplainer(class_names=['bez', 'z'])
    exp = explainer.explain_instance(data, tr_predict, num_features=f_no)
    fig = exp.as_pyplot_figure()
    # fig.show()
    exp.show_in_notebook(text=True)
    exp.as_list()
    return exp
def tr_predict(txt):
    """Map texts to class probabilities via the global pipeline ``c``
    (softmax over the transformer's logits, returned as a numpy array)."""
    output = c.predict(txt)
    probabilities = tf.nn.softmax(output[0], axis=1).numpy()
    return probabilities
txt = fasttext_tp['text']
txt
'@anonymized_account Dokładnie wie co mówi. A Ty pajacu poczytaj ustawę domsie dowiesz kto decyduje o wysokości zarobków w samorządach.'
lime_model(c, txt, len(txt.split()))
<lime.explanation.Explanation at 0x7f3e54eab410>
txt = fasttext_tn['text']
txt
'@anonymized_account Spoko, jak im Duda z Morawieckim zamówią po pięć piw to wszystko będzie ok.'
lime_model(c, txt, len(txt.split()))
<lime.explanation.Explanation at 0x7f3e55b18d50>
txt = fasttext_fp['text']
txt
'@anonymized_account @anonymized_account Kto mieczem wojuje, ten od pochwy ginie'
lime_model(c, txt, len(txt.split()))
<lime.explanation.Explanation at 0x7f3e55cdf050>
txt = fasttext_fn['text']
txt
'@anonymized_account Tej szmaty się nie komentuje'
lime_model(c, txt, len(txt.split()))
<lime.explanation.Explanation at 0x7f3e56672e10>
def lime_model(data, f_no):
    """LIME explanation for the task-1 fastText model on text ``data``."""
    explainer = LimeTextExplainer(class_names=['bez', 'z'])
    explanation = explainer.explain_instance(
        data,
        classifier_fn=lambda x: fasttext_sk(x, fasttext1, 2),
        num_features=f_no,
    )
    explanation.as_pyplot_figure()
    explanation.show_in_notebook(text=True)
    explanation.as_list()
    return explanation
def fasttext_sk(samples, model, k=2):
    """Adapt fastText's predict output to an sklearn-style probability matrix.

    Parameters
    ----------
    samples : list[str]
        Texts to classify.
    model
        fastText model whose ``predict(samples, k)`` returns per-sample
        (labels, probabilities), labels formatted as '__label__<int>'.
    k : int
        Number of classes / top predictions requested.

    Returns
    -------
    np.ndarray of shape (len(samples), k) — P(class j) for each sample.
    """
    labels_batch, probs_batch = model.predict(samples, k)
    rows = []
    for sample_labels, sample_probs in zip(labels_batch, probs_batch):
        classes = [int(lbl.replace('__label__', '')) for lbl in sample_labels]
        probs = [round(float(p), 2) for p in sample_probs]
        # BUG FIX: the row was initialised as [0, 1, ..., k-1], so any class
        # missing from the top-k predictions kept its *index* as its
        # "probability". Initialise with zeros instead.
        row = [0.0] * k
        for class_id, prob in zip(classes, probs):
            row[class_id] = prob
        rows.append(row)
    return np.array(rows)
lime_model('jesteś głupi', 2)
<lime.explanation.Explanation at 0x7f3e558d2690>
txt = fasttext_tp['text']
txt
'@anonymized_account Dokładnie wie co mówi. A Ty pajacu poczytaj ustawę domsie dowiesz kto decyduje o wysokości zarobków w samorządach.'
lime_model(txt, len(txt.split()))
<lime.explanation.Explanation at 0x7f3e55b82690>
txt = fasttext_tn['text']
txt
'@anonymized_account Spoko, jak im Duda z Morawieckim zamówią po pięć piw to wszystko będzie ok.'
lime_model(txt, len(txt.split()))
<lime.explanation.Explanation at 0x7f3e563c4b90>
txt = fasttext_fp['text']
txt
'@anonymized_account @anonymized_account Kto mieczem wojuje, ten od pochwy ginie'
lime_model(txt, len(txt.split()))
<lime.explanation.Explanation at 0x7f3e558cf650>
txt = fasttext_fn['text']
txt
'@anonymized_account Tej szmaty się nie komentuje'
lime_model(txt, len(txt.split()))
<lime.explanation.Explanation at 0x7f3e56dd2b90>
# Same confusion-matrix example selection as before, now for task 2.
fasttext_t = task2_test_df[task2_test_df['class']==task2_test_df['fasttext']]
fasttext_tp = fasttext_t[fasttext_t['fasttext'] == 1].iloc[0]
fasttext_tn = fasttext_t[fasttext_t['fasttext'] == 0].iloc[0]
fasttext_f = task2_test_df[task2_test_df['class']!=task2_test_df['fasttext']]
fasttext_fp = fasttext_f[fasttext_f['fasttext'] == 1].iloc[0]
fasttext_fn = fasttext_f[fasttext_f['fasttext'] == 0].iloc[0]
fasttext_tp
text @anonymized_account @anonymized_account @anony... class 1 tfidf 0 fasttext 1 transformer 1 Name: 746, dtype: object
c.predict_proba([fasttext_tp['text']])
array([[1., 0., 0.]])
def lime_model(model, data, f_no):
    """LIME explanation for a 3-class (task 2) sklearn pipeline on ``data``."""
    explainer = LimeTextExplainer(class_names=['bez', 'cyberbulling', 'hejt'])
    explanation = explainer.explain_instance(
        data, model.predict_proba, num_features=f_no, labels=[0, 1, 2]
    )
    explanation.as_pyplot_figure()
    explanation.show_in_notebook(text=True)
    explanation.as_list()
    return explanation
from sklearn.pipeline import make_pipeline
vectorizer = tfidf_vect2
c = make_pipeline(vectorizer, DenseTransformer(), tfidf_model2)
txt = fasttext_tp['text']
lime_model(c, txt, len(txt.split()))
<lime.explanation.Explanation at 0x7fd642f836d0>
txt = fasttext_tn['text']
lime_model(c, txt, len(txt.split()))
<lime.explanation.Explanation at 0x7fd5c2335d90>
txt = fasttext_fp['text']
e = lime_model(c, txt, len(txt.split()))
e.show_in_notebook(text=True, labels=(2,))
txt = fasttext_fn['text']
lime_model(c, txt, len(txt.split()))
<lime.explanation.Explanation at 0x7fd5b176d9d0>
c = make_pipeline(TraTransformer(), m2)
def tr_predict(txt):
    """Map texts to task-2 class probabilities via the global pipeline ``c``
    (softmax over the transformer's logits)."""
    p = c.predict(txt)
    rd = tf.nn.softmax(p[0], axis=1).numpy()
    return rd
def lime_model(model, data, f_no):
    """LIME explanation for the task-2 transformer pipeline on ``data``.

    NOTE(review): ``model`` is unused — the explainer calls the global
    ``tr_predict``, which wraps the global pipeline ``c``.
    """
    explainer = LimeTextExplainer(class_names=['bez', 'cyberbulling', 'hejt'])
    exp = explainer.explain_instance(data, tr_predict, num_features=f_no, labels=[0,1,2])
    fig = exp.as_pyplot_figure()
    # fig.show()
    exp.show_in_notebook(text=True, labels=(2,))
    exp.as_list()
    return exp
txt = fasttext_tp['text']
e = lime_model(c, txt, len(txt.split()))
e.show_in_notebook(text=True, labels=(2,))
txt = fasttext_tn['text']
e = lime_model(c, txt, len(txt.split()))
e.show_in_notebook(text=True, labels=(2,))
txt = fasttext_fp['text']
e = lime_model(c, txt, len(txt.split()))
txt = fasttext_fn['text']
e = lime_model(c, txt, len(txt.split()))
def fasttext_sk(samples, model, k=3):
    """Adapt fastText's predict output to an sklearn-style probability matrix
    (3-class variant for task 2).

    ``model.predict(samples, k)`` returns per-sample (labels, probabilities)
    with labels formatted as '__label__<int>'. Returns an ndarray of shape
    (len(samples), k) holding P(class j) per sample.
    """
    labels_batch, probs_batch = model.predict(samples, k)
    rows = []
    for sample_labels, sample_probs in zip(labels_batch, probs_batch):
        classes = [int(lbl.replace('__label__', '')) for lbl in sample_labels]
        probs = [round(float(p), 2) for p in sample_probs]
        # BUG FIX: the row was initialised as [0, 1, ..., k-1], so any class
        # missing from the top-k kept its *index* as its "probability".
        row = [0.0] * k
        for class_id, prob in zip(classes, probs):
            row[class_id] = prob
        rows.append(row)
    return np.array(rows)
def lime_model(data, f_no):
    """LIME explanation for the task-2 fastText model on text ``data``."""
    explainer = LimeTextExplainer(class_names=['bez', 'cyberbulling', 'hejt'])
    explanation = explainer.explain_instance(
        data,
        classifier_fn=lambda x: fasttext_sk(x, fasttext2, 3),
        num_features=f_no,
        labels=[0, 1, 2],
    )
    explanation.as_pyplot_figure()
    explanation.show_in_notebook(text=True)
    explanation.as_list()
    return explanation
txt = fasttext_tp['text']
e = lime_model(txt, len(txt.split()))
e.show_in_notebook(text=True, labels=(2,))
txt = fasttext_tn['text']
lime_model(txt, len(txt.split()))
<lime.explanation.Explanation at 0x7f8b04e09fd0>
txt = fasttext_fp['text']
lime_model(txt, len(txt.split()))
<lime.explanation.Explanation at 0x7f8b09736fd0>
txt = fasttext_fn['text']
lime_model(txt, len(txt.split()))
<lime.explanation.Explanation at 0x7f8b098a3710>
For both tasks best is fasttext classifier:
TASK 1
TFIDF Accuracy: 0.783 F1: 0.7878030091930415 Macro F1: 0.5534161605726151 Micro F1: 0.7829999999999999 MCC: 0.10745050407812812
FASTTEXT Accuracy: 0.877 F1: 0.8494275537721001 Macro F1: 0.6226519286167892 Micro F1: 0.8769999999999999 MCC: 0.3111077063768808
TRANSFORMER Accuracy: 0.866 F1: 0.8038113612004287 Macro F1: 0.4640943193997856 Micro F1: 0.866 MCC: 0.0
TASK 2
TFIDF Accuracy: 0.776 F1: 0.7774204426600595 Macro F1: 0.3257908913986824 Micro F1: 0.776 MCC: 0.06054021107458979
FASTTEXT Accuracy: 0.855 F1: 0.8123805420064407 Macro F1: 0.32823970319591705 Micro F1: 0.855 MCC: 0.14189520715595239
TRANSFORMER Accuracy: 0.134 F1: 0.03166843033509701 Macro F1: 0.11816578483245152 Micro F1: 0.134 MCC: 0.0
Both task datasets are very imbalanced so accuracy is not a good metric (especially if it is high - model could only answer 0 for every answer and accuracy would still be high). But F1 and MCC are good metrics for this and Fasttext was the best for both tasks.
For first task next was Transformer and the worst was TFIDF. But F1 metrics here are all kind of high (>0.78) here.
For the second task the second-best model was TFIDF, and the worst (really, really bad) performance came from the Transformer (maybe it should be trained for more than 3 epochs, but one epoch took about 1.5 hours).
Fasttext - I have higher results which is... disturbing. (F1 for task1|F1-min task2: 0.85|0.88 vs 0.4135|0.4722)
This is for transformers so:
For the first task - let's say yes, because on the test dataset the accuracy was even high, but let's face it: on the training dataset the accuracy was very low (sparse_categorical_accuracy: 0.5021).
For second task - it's not even worth commenting, it is really low. (which could be useful in 2-class classification :D )
First - analysis
Disclaimer 1 - more results are visible when opening this notebook as html or in colab https://colab.research.google.com/drive/10DcPgyE7Q3wRO3lE_ujK3dUuqK2RUIfM?usp=sharing - some visualizations are missing in the ipynb opened in jupyter
Disclaimer 2 - Positive score - there is CB in sentence; positive word - it is just positive (or normal) - doesn't bring CB to the sentence :D sorry for this mess
task 1
**TP
'@anonymized_account Dokładnie wie co mówi. A Ty pajacu poczytaj ustawę domsie dowiesz kto decyduje o wysokości zarobków w samorządach.'
Only fasttext recognises that the word 'pajacu' is strongly negative. Transformer sees it as positive and TFIDF as slightly negative. Also, fasttext understands that 'Ty' is often a part of bullying. For the rest, 'ustawę' and 'dowiesz' were negative. TFIDF was almost sure that this sentence has no CB because of the words poczytaj, dowiesz, decyduje, wysokości. Transformer was not really sure (0.52 vs 0.48) but chose the NO CB class, and I don't understand its choices - it treats 'pajacu' and 'ustawę' as positive words and 'Dokładnie, wie, mówi' as negative ones.
TN '@anonymized_account Spoko, jak im Duda z Morawieckim zamówią po pięć piw to wszystko będzie ok.'
Fasttext - ok words: spoko, Morawieckim(¯_(ツ)_/¯), będzie, ok; not ok: Duda - seems almost logical to me
Transformer - ok words: Morawieckim, zamówią, będzie; not ok: anonymyzed account, spoko
TFIDF - ok: Morawieckim, pięć, ok; not ok: z, będzie, Duda
Every model here recognised this sentence as NO CB so it was ok, and they had similar premises. Also, it seems that there were more negative comments about Mr. Duda than Mr. Morawiecki in 2019 Twitter ;) (or - to be more precise - in this dataset)
FP '@anonymized_account @anonymized_account Kto mieczem wojuje, ten od pochwy ginie'
TFIDF - I got some peculiar results, as the model was sure that it had CB (100%) but the word analysis shows that every single word in this sentence has a positive impact (I mean - it has no bullying).
TRANSFORMER - not ok words: mieczem, ginie, wojuje - violent words; ok: kto, od, pochwy
FASTTEXT - model was not really sure (0.49 vs 0.51) if it has cyberbullying. Wojuje has the most negative impact and only 2 words positive (ten, od).
FN @anonymized_account Tej szmaty się nie komentuje
FASTTEXT - Model is 100% sure that every word in the sentence shows that this sentence has no CB
TRANSFORMER - only 'komentuje' word is negative, 'szmaty' is the most positive word for the model
TFIDF - word 'komentuje' is the only valuable word and it shows that sentence has no CB
TASK 2:
FASTTEXT:
TFIDF:
TRANSFORMER - this score should not be taken into consideration - it has REALLY low score and LIME shows almost nothing.
Strongs and weaknesses
Not really - performance also depends on time. Transformer training was a nightmare (as I didn't have a GPU and did it on Google Colab, which has some usage constraints), and after about 5 hrs of training it had better performance than TFIDF, but I'm not really sure if it was worth it. Not for uni labs. No.
Yet, it is important if model can explain itself (with LIME ofc). Like, for transformer, words seemed to have really low impact on score - (like 10e-8)(I don't know what had, really, maybe that were those words but just scoring was strange, or I did a mistake).
Next thing - some models can be better at other tasks - this is shown on the KLEJ Leaderboard.
And - very much depends on the dataset (number of instances, number of classes, sentence length, repeated words (like anonymized_account)).
It depended.